{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import pickle\n",
    "import dask.array as da\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.model_selection import StratifiedKFold\n",
    "from sklearn.model_selection import cross_validate\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
    "from sklearn import svm\n",
    "from sklearn import neighbors\n",
    "from sklearn import naive_bayes\n",
    "from sklearn.svm import LinearSVC\n",
    "from xgboost import XGBClassifier\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.linear_model import LogisticRegressionCV\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.gaussian_process import GaussianProcessClassifier\n",
    "\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.ensemble import AdaBoostClassifier\n",
    "from sklearn.ensemble import BaggingClassifier\n",
    "from sklearn.ensemble import ExtraTreesClassifier\n",
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "from sklearn.ensemble import VotingClassifier\n",
    "\n",
    "from sklearn import metrics\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.metrics import classification_report\n",
    "\n",
    "from sklearn.externals import joblib\n",
    "\n",
    "%config InlineBackend.figure_format = 'svg'\n",
    "%matplotlib inline\n",
    "\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"train_ret_value_tfidf_features.pkl\", \"rb\") as fp:\n",
    "    train_tfidf_features = pickle.load(fp)\n",
    "with open(\"test_ret_value_tfidf_features.pkl\", \"rb\") as fp:\n",
    "    test_tfidf_features = pickle.load(fp)\n",
    "safe_type = pd.read_csv(\"safe_type.csv\", header=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "bc_model = BaggingClassifier()\n",
    "gbc_model = GradientBoostingClassifier()\n",
    "lr_model = LogisticRegression()\n",
    "svm_model = svm.LinearSVC()\n",
    "dt_model = DecisionTreeClassifier()\n",
    "xgb_model = XGBClassifier(max_depth=7,\n",
    "                          learning_rate=0.05,\n",
    "                          n_estimators=1000)\n",
    "\n",
    "rfc_model = RandomForestClassifier(200)\n",
    "etc_model = ExtraTreesClassifier()\n",
    "mnb_model = naive_bayes.MultinomialNB(alpha=0.01)\n",
    "ada_model = AdaBoostClassifier()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_oof(model, x_train, y_train, x_test, n_splits):\n",
    "    \"\"\"\n",
    "    :@param x_train: feature matrix.\n",
    "    :type x: np.array(M X N) or list(M X N).\n",
    "    :@param y_train: class label.\n",
    "    :type y: int.\n",
    "    :@param x_test: test set feature matrix.\n",
    "    :type x_test: np.array(M X N) or list(M X N).\n",
    "    :@param n_splits: K-fold parameter.\n",
    "    :type n_splits: int.\n",
    "    \"\"\"\n",
    "    n_train, n_test = x_train.shape[0], x_test.shape[0]\n",
    "    kf = StratifiedKFold(n_splits=n_splits, random_state=0)\n",
    "    oof_train = np.empty((n_train, ))\n",
    "    oof_test = np.empty((n_test, ))\n",
    "    oof_test_skf = np.empty((n_splits, n_test))\n",
    "    for i, (train_index, test_index) in enumerate(kf.split(x_train, y_train)):\n",
    "        kf_x_train = x_train[train_index]\n",
    "        kf_y_train = y_train[train_index]\n",
    "        kf_x_test = x_train[test_index]\n",
    "        model.fit(kf_x_train, kf_y_train)\n",
    "        oof_train[test_index] = model.predict(kf_x_test)\n",
    "        oof_test_skf[i, :] = model.predict(x_test)\n",
    "    oof_test[:] = oof_test_skf.mean(axis=0)\n",
    "    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "rfc success!\n",
      "etc success!\n",
      "mnb success!\n",
      "ada success!\n"
     ]
    }
   ],
   "source": [
    "try:\n",
    "    lr_model_oof_train, lr_model_oof_test = get_oof(lr_model, \n",
    "                                                    train_tfidf_features.tolil(), \n",
    "                                                    safe_type.values,\n",
    "                                                    test_tfidf_features.tolil(),\n",
    "                                                    10)\n",
    "    with open(\"ret_value_lr_model_oof_train.csv\", \"wb\") as fp:\n",
    "        pickle.dump(lr_model_oof_train, fp)\n",
    "    with open(\"ret_value_lr_model_oof_test.csv\", \"wb\") as fp:\n",
    "        pickle.dump(lr_model_oof_test, fp)\n",
    "    print(\"lr success!\")\n",
    "except:\n",
    "    print(\"lr error!\")\n",
    "try:\n",
    "    gbc_model_oof_train, gbc_model_oof_test = get_oof(gbc_model, \n",
    "                                                      train_tfidf_features.tolil(), \n",
    "                                                      safe_type.values,\n",
    "                                                      test_tfidf_features.tolil(),\n",
    "                                                      10)\n",
    "    with open(\"ret_value_gbc_model_oof_train.csv\", \"wb\") as fp:\n",
    "        pickle.dump(gbc_model_oof_train, fp)\n",
    "    with open(\"ret_value_gbc_model_oof_test.csv\", \"wb\") as fp:\n",
    "        pickle.dump(gbc_model_oof_test, fp)\n",
    "    print(\"gbc success!\")\n",
    "except:\n",
    "    print(\"gbc error!\")\n",
    "try:\n",
    "    bc_model_oof_train, bc_model_oof_test = get_oof(bc_model, \n",
    "                                                    train_tfidf_features.tolil(), \n",
    "                                                    safe_type.values,\n",
    "                                                    test_tfidf_features.tolil(),\n",
    "                                                    10)\n",
    "    with open(\"ret_value_bc_model_oof_train.csv\", \"wb\") as fp:\n",
    "        pickle.dump(bc_model_oof_train, fp)\n",
    "    with open(\"ret_value_bc_model_oof_test.csv\", \"wb\") as fp:\n",
    "        pickle.dump(bc_model_oof_test, fp)\n",
    "    print(\"bc success!\")\n",
    "except:\n",
    "    print(\"bc error!\")\n",
    "try:\n",
    "    svm_model_oof_train, svm_model_oof_test = get_oof(svm_model, \n",
    "                                                      train_tfidf_features.tolil(), \n",
    "                                                      safe_type.values,\n",
    "                                                      test_tfidf_features.tolil(),\n",
    "                                                      10)\n",
    "    with open(\"ret_value_svm_model_oof_train.csv\", \"wb\") as fp:\n",
    "        pickle.dump(svm_model_oof_train, fp)\n",
    "    with open(\"ret_value_svm_model_oof_test.csv\", \"wb\") as fp:\n",
    "        pickle.dump(svm_model_oof_test, fp)\n",
    "    print(\"svm success!\")\n",
    "except:\n",
    "    print(\"svm error!\")\n",
    "try:\n",
    "    dt_model_oof_train, dt_model_oof_test = get_oof(dt_model, \n",
    "                                                      train_tfidf_features.tolil(), \n",
    "                                                      safe_type.values,\n",
    "                                                      test_tfidf_features.tolil(),\n",
    "                                                      10)\n",
    "    with open(\"ret_value_dt_model_oof_train.csv\", \"wb\") as fp:\n",
    "        pickle.dump(dt_model_oof_train, fp)\n",
    "    with open(\"ret_value_dt_model_oof_test.csv\", \"wb\") as fp:\n",
    "        pickle.dump(dt_model_oof_test, fp)\n",
    "    print(\"dt success!\")\n",
    "except:\n",
    "    print(\"dt error!\")\n",
    "try:\n",
    "    xgb_model_oof_train, xgb_model_oof_test = get_oof(xgb_model, \n",
    "                                                      train_tfidf_features.tolil(), \n",
    "                                                      safe_type.values,\n",
    "                                                      test_tfidf_features.tolil(),\n",
    "                                                      10)\n",
    "    with open(\"ret_value_xgb_model_oof_train.csv\", \"wb\") as fp:\n",
    "        pickle.dump(xgb_model_oof_train, fp)\n",
    "    with open(\"ret_value_xgb_model_oof_test.csv\", \"wb\") as fp:\n",
    "        pickle.dump(xgb_model_oof_test, fp)\n",
    "    print(\"xgb success!\")\n",
    "except:\n",
    "    print(\"xgb error!\")\n",
    "try:\n",
    "    rfc_model_oof_train, rfc_model_oof_test = get_oof(rfc_model, \n",
    "                                                      train_tfidf_features.tolil(), \n",
    "                                                      safe_type.values,\n",
    "                                                      test_tfidf_features.tolil(),\n",
    "                                                      10)\n",
    "    with open(\"ret_value_rfc_model_oof_train.csv\", \"wb\") as fp:\n",
    "        pickle.dump(rfc_model_oof_train, fp)\n",
    "    with open(\"ret_value_rfc_model_oof_test.csv\", \"wb\") as fp:\n",
    "        pickle.dump(rfc_model_oof_test, fp)\n",
    "    print(\"rfc success!\")\n",
    "except:\n",
    "    print(\"rfc error!\")\n",
    "    \n",
    "try:\n",
    "    etc_model_oof_train, etc_model_oof_test = get_oof(etc_model, \n",
    "                                                      train_tfidf_features.tolil(), \n",
    "                                                      safe_type.values,\n",
    "                                                      test_tfidf_features.tolil(),\n",
    "                                                      10)\n",
    "    with open(\"ret_value_etc_model_oof_train.csv\", \"wb\") as fp:\n",
    "        pickle.dump(etc_model_oof_train, fp)\n",
    "    with open(\"ret_value_etc_model_oof_test.csv\", \"wb\") as fp:\n",
    "        pickle.dump(etc_model_oof_test, fp)\n",
    "    print(\"etc success!\")\n",
    "except:\n",
    "    print(\"etc error!\")\n",
    "try:\n",
    "    mnb_model_oof_train, mnb_model_oof_test = get_oof(mnb_model, \n",
    "                                                      train_tfidf_features.tolil(), \n",
    "                                                      safe_type.values,\n",
    "                                                      test_tfidf_features.tolil(),\n",
    "                                                      10)\n",
    "    with open(\"ret_value_mnb_model_oof_train.csv\", \"wb\") as fp:\n",
    "        pickle.dump(mnb_model_oof_train, fp)\n",
    "    with open(\"ret_value_mnb_model_oof_test.csv\", \"wb\") as fp:\n",
    "        pickle.dump(mnb_model_oof_test, fp)\n",
    "    print(\"mnb success!\")\n",
    "except:\n",
    "    print(\"mnb error!\")\n",
    "    \n",
    "try:\n",
    "    ada_model_oof_train, ada_model_oof_test = get_oof(ada_model, \n",
    "                                                      train_tfidf_features.tolil(), \n",
    "                                                      safe_type.values,\n",
    "                                                      test_tfidf_features.tolil(),\n",
    "                                                      10)\n",
    "    with open(\"ret_value_ada_model_oof_train.csv\", \"wb\") as fp:\n",
    "        pickle.dump(ada_model_oof_train, fp)\n",
    "    with open(\"ret_value_ada_model_oof_test.csv\", \"wb\") as fp:\n",
    "        pickle.dump(ada_model_oof_test, fp)\n",
    "    print(\"ada success!\")\n",
    "except:\n",
    "    print(\"ada error!\")\n",
    "\n",
    "\n",
    "ret_value_stacking_train_10 = np.hstack([lr_model_oof_train, gbc_model_oof_train, bc_model_oof_train,\n",
    "                            svm_model_oof_train, xgb_model_oof_train, dt_model_oof_train,\n",
    "                            rfc_model_oof_train, etc_model_oof_train, mnb_model_oof_train,\n",
    "                            ada_model_oof_train])\n",
    "ret_value_stacking_test_10 = np.hstack([lr_model_oof_test, gbc_model_oof_test, bc_model_oof_test,\n",
    "                           svm_model_oof_test, xgb_model_oof_test, dt_model_oof_test,\n",
    "                           rfc_model_oof_test, etc_model_oof_test, mnb_model_oof_test,\n",
    "                           ada_model_oof_test])\n",
    "with open(\"ret_value_stacking_train_10.pkl\", \"wb\") as fp:\n",
    "    pickle.dump(ret_value_stacking_train_10, fp)\n",
    "    \n",
    "with open(\"ret_value_stacking_test_10.pkl\", \"wb\") as fp:\n",
    "    pickle.dump(ret_value_stacking_test_10, fp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "lr = pd.read_pickle(\"exinfos_lr_model_oof_train.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "9251.0"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lr.sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "9464.0"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lr.sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "t = pd.read_pickle(\"api_name_and_ret_value_stacked_mix_train.pkl\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([9669., 9726., 9800., 9817., 9992., 9650., 9872., 9582., 9836.,\n",
       "       9844., 9128., 9116., 9575., 9559., 9885., 9487., 9613., 9308.,\n",
       "       9653., 9425.])"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "t.sum(axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
